Load R packages

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(rstatix)
library(expss)
library(ClusterR)
library(DESeq2) ; library(biomaRt)
library(knitr)

Load preprocessed dataset (code in 2.1.Preprocessing_pipeline.Rmd)

# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame




2.3.1 Exploratory analysis


Distribution of sample features by diagnosis


Diagnosis and brain region seem to be balanced except for the frontal lobe, where there are more control samples than ASD ones

table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Brain_lobe = 'Brain Lobe', 
                                      Batch = 'Batch', Sex = 'Gender')

cro(table_info$Diagnosis, list(table_info$Brain_lobe,total()))
 Brain Lobe     #Total 
 Frontal   Occipital   Parietal   Temporal   
 Diagnosis 
   CTL  13 8 8 6   35
   ASD  8 13 12 12   45
   #Total cases  21 21 20 18   80


There are many more Male than Female samples, but Diagnosis and Gender seem to be balanced

cro(table_info$Diagnosis, list(table_info$Sex, total()))
 Gender     #Total 
 F   M   
 Diagnosis 
   CTL  6 29   35
   ASD  9 36   45
   #Total cases  15 65   80


The difference in age between diagnosis groups is not statistically significant. The box plots look different between them but its probably because they have very few points each (35 and 45), so they are a bit noisy, looking at the individual points it seems like the main difference could be a few very young samples in the ASD group.

datMeta %>% ggplot(aes(Diagnosis, Age, fill = Diagnosis)) + geom_boxplot() + 
            geom_jitter(color='gray', size=2, alpha = 0.5) +
            stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) + 
            theme_minimal() + theme(legend.position = 'none')



Visualisations of the samples


PCA plots


The first principal component separates the samples by diagnosis groups perfectly, there is no recognisable pattern in the samples coloured by brain lobe

pca = datExpr %>% t %>% prcomp

plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
            mutate('ID'= datExpr %>% colnames %>% substring(2)) %>% 
            left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            dplyr::select('ID','PC1','PC2','Diagnosis','Brain_lobe') %>% 
            mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              ggtitle('Samples coloured by Diagnosis') + coord_fixed() + theme_minimal()

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=Brain_lobe, shape=Brain_lobe)) + geom_point(alpha = 0.8) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              ggtitle('Samples coloured by Brain Lobe') + coord_fixed() + labs(color = 'Brain Lobe', shape = 'Brain Lobe') + 
              theme_minimal()

grid.arrange(p1, p2, nrow = 1)

rm(pca, p1, p2)


t-SNE plots


No matter the value of the perplexity parameter, Diagnosis is always an important factor in the way samples are positioned

perplexities = c(2,5,10,20)
ps = list()

for(i in 1:length(perplexities)){
  set.seed(123)
  tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
  plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
              mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
              dplyr::select('C1','C2','Diagnosis','Subject_ID') %>%
              mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.7) + theme_minimal() +
            ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none') + coord_fixed()
}

grid.arrange(grobs=ps, nrow=2)

No matter the value of the perplexity parameter, brain lobe doesn’t seem to play any role in the way the points are positioned

perplexities = c(2,5,10,20)
ps = list()

for(i in 1:length(perplexities)){
  set.seed(123)
  tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
  plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
              mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
              dplyr::select('C1','C2','Brain_lobe','Subject_ID')
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Brain_lobe, shape=Brain_lobe)) + geom_point(alpha = 0.7) + theme_minimal() +
            ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none') + coord_fixed()
}

grid.arrange(grobs=ps, nrow=2)


As we saw above, the big clusters of points generally correspond to diagnosis groups, but we can see there are smaller clusters of two or three points. When we colour the samples by diagnosis we can see these small clusters correspond to the different samples extracted from the same subject.

perplexities = c(2,5,10,20)
ps = list()

for(i in 1:length(perplexities)){
  set.seed(123)
  tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
  plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
              mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
              dplyr::select('C1','C2','Brain_lobe','Subject_ID')
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(alpha = 0.6) + theme_minimal() +
            ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none') + coord_fixed()
}

grid.arrange(grobs=ps, nrow=2)


Interactive version of the last plot above so we can corroborate the subject ID’s from each sample (because having so many different subject IDs, subjects assigned similar colours may be confused)

ggplotly(plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(aes(id=Subject_ID)) + theme_minimal() + 
         theme(legend.position='none') + ggtitle('t-SNE Perplexity = 20 coloured by Subject ID'))



Visualisations of the genes


PCA Plot


The first principal component explains over 99% of the variance in the dataset and is strongly related to the mean level of expression of the genes

pca = datExpr %>% prcomp

plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() + 
              scale_color_viridis() + ggtitle('PCA') + coord_fixed() +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) + 
              theme(legend.position = 'bottom', legend.key.width=unit(3, 'cm')) + labs(color='Mean Expression ')

The correlation between the mean expression of the genes and the 1st principal component is 1


t-SNE


Independently of the perplexity parameter, the mean level of expression of the genes seems to continue to be the main feature that characterises the genes

perplexities = c(1,2,5,10,50,100)
ps = list()

for(i in 1:length(perplexities)){
  
  file_name = paste0('./../Data/Results/tsne_perplexity_',perplexities[i],'.csv')
  
  # Check if file with t-sne coordinates already exists, if not, calculate coordinates
  if(file.exists(file_name)){
    tsne = read.csv(file_name)
  } else {
    set.seed(123)
    tsne = datExpr %>% Rtsne(perplexity=perplexities[i])
    tsne_coords = cbind(tsne$Y, rownames(datExpr))
    colnames(tsne_coords) = c('C1','C2','ID')
    write.csv(tsne_coords, file_name, row.names=F)
    tsne = tsne_coords
  }
  
  # Plot results
  plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() + coord_fixed() +
            scale_color_viridis() + ggtitle(paste0('Perplexity = ', perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)



Mean level of expression


Samples belonging to the ASD group have statistically significantly higher levels of expression than the Control group (this was unexpected)

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID'))

plot_data %>% ggplot(aes(Diagnosis, Mean, fill = Diagnosis)) + 
              geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
              stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) + 
              ylab('Mean Expression') + theme_minimal() + theme(legend.position = 'none')


Samples extracted from the frontal lobe have a slightly lower mean level of expression than the samples from the other lobes, but this difference is not statistically significant

wt = plot_data %>% wilcox_test(Mean~Brain_lobe, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = .006
base = 8.175
pos_y_comparisons = c(base, base+increase, base+3*increase, base, base+2*increase, base)

p1 = plot_data %>% ggplot(aes(Brain_lobe, Mean)) + 
              geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3, aes(fill = Brain_lobe)) + 
              stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .02) +
              xlab('Brain Lobe') + ylab('Mean Expression') + theme(legend.position = 'none') + 
              theme_minimal() + theme(legend.position = 'none')


p2 = plot_data %>% mutate(frontal_lobe = ifelse(Brain_lobe == 'Frontal', 'Frontal Lobe', 'Other lobes')) %>%
              ggplot(aes(frontal_lobe, Mean, fill = frontal_lobe)) + 
              geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
              stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) + 
              xlab('Brain Lobe') + ylab('Mean Expression') + theme(legend.position = 'none') + 
              theme_minimal() + theme(legend.position = 'none')

grid.arrange(p1, p2, nrow=1, widths = c(0.6,0.4))




2.3.2 Differential Expression Analysis


Log fold change threshold


There are 4473 DE genes using a threshold of LFC=0 (~28% of the total number of genes). As the threshold increases, the number of DE genes quickly decreases.

lfc_list = seq(0, 0.3, 0.01)

n_genes = nrow(datExpr)

# PCA
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp

# Initialise data frame to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=colnames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))

pca_samps_old = pcas_samps

for(lfc in lfc_list){
  
  # Recalculate DE_info with the new threshold (p-values change) an filter DE genes
  DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>%
             mutate('ID'=rownames(.)) %>% filter(padj<0.05)
  
  datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
  n_genes = c(n_genes, nrow(DE_genes))
  
  # Calculate PCAs
  datExpr_pca_samps = datExpr_DE %>% t %>% prcomp

  # Create new DF entries
  pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=colnames(datExpr), 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
  
  # Change PC sign if necessary
  if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
  if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
  
  pca_samps_old = pca_samps_new
  
  # Update DFs
  pcas_samps = rbind(pcas_samps, pca_samps_new)
  
}

# Add Diagnosis information
pcas_samps = pcas_samps %>% mutate('ID'=substring(ID,2)) %>% 
             left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
             dplyr::select(ID, PC1, PC2, lfc, Diagnosis)

# Plot change of number of genes
ggplotly(data.frame('lfc'=lfc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) + 
         geom_point() + geom_line() + theme_minimal() + xlab('Log Fold Change Magnitude') + ylab('DE Genes') +
         ggtitle('Number of Differentially Expressed genes when modifying filtering threshold'))
rm(lfc_list, n_genes, lfc, pca_samps_new, pca_samps_old, datExpr_pca_samps)



Set of differentially expressed genes


Volcano plot

library(ggrepel)

ggplotly(genes_info %>% mutate(gene_name = datGenes$hgnc_symbol) %>%
        ggplot(aes(shrunken_log2FoldChange, padj, color=significant)) + 
        geom_point(alpha=0.2, aes(id = gene_name)) + scale_y_sqrt() +
        xlab('LFC') + ylab('Adjusted p-value') + theme_minimal() + labs(color = 'DE')) #+
               #geom_label_repel(aes(label=ifelse(shrunken_log2FoldChange>0.5 | shrunken_log2FoldChange< -0.39, 
               #                                  as.character(gene_name),'')), direction = 'y', nudge_y = 0.01)


Most of the genes on this list are overexpressed in ASD and do not have a Neuronal tag

top_genes = genes_info %>% mutate(Gene = datGenes$hgnc_symbol) %>% filter(significant == TRUE) %>%
            arrange(-abs(shrunken_log2FoldChange)) %>% top_n(25, wt=abs(shrunken_log2FoldChange)) %>%
            mutate(Neuronal = as.logical(Neuronal), LFC = shrunken_log2FoldChange, n = 1:25)

kable(top_genes %>% dplyr::select(n, Gene, LFC, padj, Neuronal), caption = 'Top 25 DE genes ordered by LFC magnitude')
Top 25 DE genes ordered by LFC magnitude
n Gene LFC padj Neuronal
1 HSPA6 0.6915903 0.00e+00 FALSE
2 THBS1 0.5417591 0.00e+00 FALSE
3 IGF2BP2 0.5348835 5.00e-07 FALSE
4 SHD -0.5289553 0.00e+00 FALSE
5 TNFRSF10D 0.5237673 2.00e-07 FALSE
6 BNC2 0.5015931 0.00e+00 FALSE
7 VAMP1 -0.4946280 0.00e+00 TRUE
8 WEE1 0.4915898 0.00e+00 TRUE
9 SERPINH1 0.4862111 6.30e-06 FALSE
10 NFKB2 0.4842801 0.00e+00 FALSE
11 CP 0.4796249 5.50e-06 FALSE
12 ITGA5 0.4749568 0.00e+00 FALSE
13 STC1 0.4722304 1.72e-05 FALSE
14 GPRC5A 0.4610657 0.00e+00 FALSE
15 SPTLC3 0.4596896 4.00e-07 FALSE
16 GBP2 0.4574062 2.40e-06 FALSE
17 SLC11A1 0.4572782 5.10e-06 FALSE
18 VIM 0.4550495 0.00e+00 TRUE
19 PODN 0.4546192 0.00e+00 FALSE
20 ANXA2 0.4545567 5.40e-06 FALSE
21 OLFML2B 0.4511181 0.00e+00 FALSE
22 HSPB1 0.4494746 0.00e+00 FALSE
23 TIMP1 0.4466072 1.82e-05 FALSE
24 CD93 0.4420008 3.40e-06 FALSE
25 NUPR1 0.4369911 1.64e-05 FALSE


PCA plot of samples

PCA plot of genes characterised by each LFC threshold’s corresponding set of DE genes.

  • The frame corresponding to LFC = -1 doesn’t correspond to any LFC, all of the genes in the dataset were used to create this plot

  • LFC thresholds from 0 to 0.1 seem to separate the samples by diagnosis better than the rest

  • LFC thresholds above 0.2 aren’t able to separate the samples by diagnosis any more

  • The PC values for each LFC were scaled so they were easier to compare (without this scale, as the LFC increases, the points begin to cluster more and more tightly at the centre of the plot)

ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(lfc==-1,-1,lfc)) %>% 
         ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + 
         geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.7) + coord_fixed() +
         theme_minimal() + ggtitle('PCA plot of samples modifying filtering threshold'))
# # Figure for thesis:
# p1 = pcas_samps %>% filter(lfc == 0) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + 
#      geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0') + theme(legend.position = 'none') + coord_fixed()
# p2 = pcas_samps %>% filter(lfc == 0.1) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + 
#      geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.1') + theme(legend.position = 'none') + coord_fixed()
# p3 = pcas_samps %>% filter(lfc == 0.2) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + 
#      geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.2') + theme(legend.position = 'none') + coord_fixed()
# p4 = pcas_samps %>% filter(lfc == 0.3) %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + 
#      geom_point(alpha=0.7) + theme_minimal() + ggtitle('LFC = 0.3') + coord_fixed()
# ggarrange(p1,p2,p3,p4, nrow=1, common.legend = TRUE, legend='bottom', widths = c(0.22, 0.25, 0.28, 0.24))


PCA plot of samples characterised by all of the genes vs characterised only by the genes found to be differentially expressed (this plot is just a side-by-side comparison of two frames from the plot above)

pca = datExpr %>% t %>% prcomp

plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
            mutate('ID'= datExpr %>% colnames %>% substring(2)) %>% 
            left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            dplyr::select('ID','PC1','PC2','Diagnosis') %>% 
            mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              ggtitle('Characterising samples with all genes') +
              coord_fixed() + theme_minimal() + theme(legend.position = 'none')


pca = datExpr[genes_info$significant==TRUE,] %>% t %>% prcomp

plot_data = pca$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
            mutate('ID'= datExpr %>% colnames %>% substring(2)) %>% 
            left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            dplyr::select('ID','PC1','PC2','Diagnosis') %>% 
            mutate('Diagnosis' = factor(Diagnosis, levels=c('CTL','ASD')))

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + geom_point(alpha = 0.8) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              ggtitle('Characterising samples with DE genes') +
              coord_fixed() + theme_minimal()

ggarrange(p1,p2, nrow=1, common.legend = TRUE, legend='bottom')

rm(p1, p2, pca, plot_data)


PCA plot of genes coloured by their LFC


The second principal component is strongly related to the LFC of the genes and when visualising only the differentially expressed genes, they form two clouds of points, one with the overexpressed genes and another with the underexpressed genes.

# All genes
pca = datExpr %>% prcomp

plot_data = genes_info %>% mutate(PC1 = pca$x[,1], PC2 = pca$x[,2])

pos_zero = -min(plot_data$shrunken_log2FoldChange)/(max(plot_data$shrunken_log2FoldChange)-min(plot_data$shrunken_log2FoldChange))
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=shrunken_log2FoldChange)) + geom_point(alpha=0.5) +
    scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'), 
                          values=c(0, pos_zero-0.15, pos_zero, pos_zero+0.15, 1)) +
    xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
    ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) + 
    theme_minimal() + ggtitle('PCA of all genes') + 
    theme(legend.position = 'bottom', legend.key.width=unit(2,'cm')) + labs(color = 'LFC ')

# Only DE genes
pca = datExpr[genes_info$significant,] %>% prcomp

plot_data = genes_info %>% dplyr::filter(significant == TRUE) %>% mutate(PC1 = pca$x[,1], PC2 = pca$x[,2])

pos_zero = -min(plot_data$shrunken_log2FoldChange)/(max(plot_data$shrunken_log2FoldChange)-min(plot_data$shrunken_log2FoldChange))
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=shrunken_log2FoldChange)) + geom_point(alpha=0.5) +
    scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'), 
                          values=c(0, pos_zero-0.15, pos_zero, pos_zero+0.15, 1)) +
    xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
    ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) + 
    theme_minimal() + ggtitle('PCA of differentially expressed genes') + 
    theme(legend.position = 'bottom', legend.key.width=unit(2,'cm')) + labs(color = 'LFC ')

ggarrange(p1,p2, nrow=1, common.legend = TRUE, legend='bottom')

rm(pca, plot_data, pos_zero, p1, p2)


Level of expression comparison between over and underexpressed genes


The difference in mean level of expression between over and underexpressed genes is statistically significant with a p=value lower than \(10^{-4}\)

plot_data = data.frame('MeanExpr' = rowMeans(datExpr), 
                       'Group' = ifelse(genes_info$shrunken_log2FoldChange>0,'overexpressed','underexpressed')) %>%
            mutate(Group = factor(Group, levels = c('underexpressed','overexpressed')))

plot_data[genes_info$significant==TRUE,] %>% ggplot(aes(Group, MeanExpr, fill = Group)) + 
              geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
              stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) + 
              ylab('Mean Expression') + xlab('') + theme(legend.position = 'none') + 
              theme_minimal() + theme(legend.position = 'none')



Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggrepel_0.9.1               knitr_1.32                 
##  [3] biomaRt_2.40.5              DESeq2_1.24.0              
##  [5] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
##  [7] BiocParallel_1.18.1         matrixStats_0.60.1         
##  [9] Biobase_2.44.0              GenomicRanges_1.36.1       
## [11] GenomeInfoDb_1.20.0         IRanges_2.18.3             
## [13] S4Vectors_0.22.1            BiocGenerics_0.30.0        
## [15] ClusterR_1.2.1              gtools_3.9.2               
## [17] expss_0.10.7                rstatix_0.6.0              
## [19] Rtsne_0.15                  ggpubr_0.2.5               
## [21] magrittr_2.0.1              GGally_1.5.0               
## [23] gridExtra_2.3               viridis_0.6.1              
## [25] viridisLite_0.4.0           RColorBrewer_1.1-2         
## [27] plotlyutils_0.0.0.9000      plotly_4.9.2               
## [29] glue_1.4.2                  reshape2_1.4.4             
## [31] forcats_0.5.0               stringr_1.4.0              
## [33] dplyr_1.0.1                 purrr_0.3.4                
## [35] readr_1.3.1                 tidyr_1.1.0                
## [37] tibble_3.1.2                ggplot2_3.3.5              
## [39] tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.2.1        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] gmp_0.6-2              crosstalk_1.1.1        digest_0.6.27         
##  [10] htmltools_0.5.2        fansi_0.5.0            checkmate_2.0.0       
##  [13] memoise_2.0.0          cluster_2.1.0          openxlsx_4.2.4        
##  [16] annotate_1.62.0        modelr_0.1.6           prettyunits_1.1.1     
##  [19] jpeg_0.1-9             colorspace_2.0-2       blob_1.2.2            
##  [22] rvest_0.3.5            haven_2.2.0            xfun_0.25             
##  [25] crayon_1.4.1           RCurl_1.98-1.4         jsonlite_1.7.2        
##  [28] genefilter_1.66.0      survival_3.2-7         gtable_0.3.0          
##  [31] zlibbioc_1.30.0        XVector_0.24.0         car_3.0-7             
##  [34] abind_1.4-5            scales_1.1.1           DBI_1.1.1             
##  [37] Rcpp_1.0.7             progress_1.2.2         xtable_1.8-4          
##  [40] htmlTable_1.13.3       foreign_0.8-76         bit_4.0.4             
##  [43] Formula_1.2-4          htmlwidgets_1.5.3      httr_1.4.2            
##  [46] acepack_1.4.1          ellipsis_0.3.2         farver_2.1.0          
##  [49] pkgconfig_2.0.3        reshape_0.8.8          XML_3.99-0.3          
##  [52] nnet_7.3-14            sass_0.4.0             dbplyr_1.4.2          
##  [55] locfit_1.5-9.4         utf8_1.2.2             labeling_0.4.2        
##  [58] tidyselect_1.1.1       rlang_0.4.11           AnnotationDbi_1.46.1  
##  [61] cachem_1.0.6           munsell_0.5.0          cellranger_1.1.0      
##  [64] tools_3.6.3            cli_3.0.1              generics_0.1.0        
##  [67] RSQLite_2.2.0          broom_0.7.0            evaluate_0.14         
##  [70] fastmap_1.1.0          yaml_2.2.1             bit64_4.0.5           
##  [73] fs_1.5.0               zip_2.2.0              xml2_1.3.2            
##  [76] compiler_3.6.3         rstudioapi_0.13        curl_4.3.2            
##  [79] png_0.1-7              ggsignif_0.6.2         reprex_0.3.0          
##  [82] geneplotter_1.62.0     bslib_0.3.0            stringi_1.7.4         
##  [85] highr_0.9              lattice_0.20-41        Matrix_1.2-18         
##  [88] vctrs_0.3.8            pillar_1.6.2           lifecycle_1.0.0       
##  [91] jquerylib_0.1.4        cowplot_1.1.1          data.table_1.14.0     
##  [94] bitops_1.0-7           R6_2.5.1               latticeExtra_0.6-29   
##  [97] rio_0.5.16             assertthat_0.2.1       withr_2.4.2           
## [100] GenomeInfoDbData_1.2.1 hms_1.1.0              grid_3.6.3            
## [103] rpart_4.1-15           rmarkdown_2.7          carData_3.0-4         
## [106] lubridate_1.7.10       base64enc_0.1-3